# Tabulate the number of rows for each value of ScanAvailable.
kable(
  geolocations[
    , list(`Image Count` = .N)
    , keyby = list(`Image Available Ind` = ScanAvailable)
  ]
)
| Image Available Ind | Image Count |
|---|---|
| FALSE | 348779 |
| TRUE | 200924 |
# Flag each flight with 1 when at least one of its rows has ScanAvailable set
# and 0 otherwise, then count the flights carrying each flag value.
# The per-group flag is a single value, so it is derived with any() instead of
# the vectorised ifelse().
geolocations[
  , .(Flight_Geolocation_Ind = as.integer(any(ScanAvailable)))
  , keyby = .(FlightID)
][
  , .(`Flight Count` = .N)
  , keyby = .(`Flight Geolocation Ind` = Flight_Geolocation_Ind)
] %>%
  kable()
| Flight Geolocation Ind | Flight Count |
|---|---|
| 0 | 423 |
| 1 | 1288 |
# Per flight, compute the floored percentage of rows with ScanAvailable set.
# Flights with no such rows get the sentinel -1 so they land in their own bar.
ggplotly(
  ggplot(
    geolocations[
      , list(Image_Count = .N, Geolocation_Count = sum(ScanAvailable))
      , keyby = "FlightID"
    ][
      , list(
        Pct_Geolocated_Images = ifelse(
          Geolocation_Count > 0,
          floor(Geolocation_Count / Image_Count * 100),
          -1
        )
      )
    ],
    aes(x = Pct_Geolocated_Images)
  ) +
    geom_bar() +
    labs(x = "Portions of each flight's images that are geolocated") +
    ylab('Flight Frequency') +
    theme_bw()
)
Note: Going forward in this EDA, only the flights and images with available geolocations are considered.
# Restrict the working data to the rows where ScanAvailable is TRUE.
geolocations <- geolocations[(ScanAvailable)]

# Count the distinct dates of each flight, then tabulate how many flights
# share each distinct-date count.
kable(
  geolocations[
    , list(Distinct_Date_Count = data.table::uniqueN(Date))
    , keyby = "FlightID"
  ][
    , list(`Flight Frequency` = .N)
    , keyby = list(`Distinct Date Count` = Distinct_Date_Count)
  ]
)
| Distinct Date Count | Flight Frequency |
|---|---|
| 1 | 1276 |
| 2 | 7 |
| 3 | 1 |
| 6 | 1 |
| 73 | 1 |
| 83 | 1 |
| 114 | 1 |
# Flights whose rows span more than one distinct Date, with that date count.
flights.multidate <- geolocations[
  , list(dateCount = data.table::uniqueN(Date))
  , keyby = "FlightID"
][dateCount > 1]
# Report the first date, last date and inclusive day span of every multi-date
# flight, ordered from the shortest span to the longest.
#
# The join keeps only the rows of `geolocations` whose FlightID appears in
# `flights.multidate` (inner join). `nomatch = 0L` is a documented way to
# request that; a logical value is not part of the documented interface.
flights.multidate[
  geolocations
  , on = .(FlightID)
  , nomatch = 0L
][
  , .(
    Date.min = min(Date),
    Date.max = max(Date)
  )
  , keyby = .(FlightID)
][
  , .(
    FlightID,
    `Date (min)` = Date.min,
    `Date (max)` = Date.max,
    # + 1 so that both the first and the last day are counted.
    `Date Range` = Date.max - Date.min + 1
  )
][
  order(`Date Range`)
] %>%
  kable()
| FlightID | Date (min) | Date (max) | Date Range |
|---|---|---|---|
| C_6650 | 1940-09-26 | 1940-09-27 | 2 |
| AMI_VEN_75 | 1975-12-02 | 1975-12-09 | 8 |
| AMI_LA_82 | 1982-01-23 | 1982-01-31 | 9 |
| TA_CF | 1966-12-16 | 1966-12-27 | 12 |
| AMI_LA_86 | 1986-03-22 | 1986-05-04 | 44 |
| AMI_SBD_85 | 1985-01-18 | 1985-03-25 | 67 |
| AMI_SD_77 | 1977-01-17 | 1977-07-01 | 166 |
| CAS_PLA | 1962-01-01 | 1962-07-28 | 209 |
| CAS_3390 | 1972-06-14 | 1973-01-23 | 224 |
| NAPP | 1987-06-16 | 1990-09-07 | 1180 |
| NAPP_2C | 1992-08-26 | 1996-09-30 | 1497 |
| NAPP_3C | 1998-08-03 | 2003-06-22 | 1785 |
Note: Going forward in this EDA, the 12 flights identified above are filtered out unless otherwise stated.
# Anti-join: keep the rows whose FlightID is absent from flights.multidate.
geolocations.singledate_flights <- geolocations[!flights.multidate, on = "FlightID"]
# Interactive bar chart of the number of rows (images) per Year.
ggplotly(
  ggplot(
    geolocations.singledate_flights[, list(Image_Count = .N), keyby = "Year"],
    aes(x = Year, y = Image_Count)
  ) +
    geom_bar(stat = 'identity') +
    ylab('Image Count') +
    theme_bw()
)
# Count rows per (FlightID, Year), average those counts within each Year, and
# draw the yearly means as an interactive bar chart.
ggplotly(
  ggplot(
    geolocations.singledate_flights[
      , list(Image_Count = .N)
      , keyby = c("FlightID", "Year")
    ][
      , list(Image_Count.mean = mean(Image_Count))
      , keyby = "Year"
    ],
    aes(x = Year, y = Image_Count.mean)
  ) +
    geom_bar(stat = 'identity') +
    ylab('Average Number of Images per Flights') +
    theme_bw()
)
# Tabulate the number of rows falling in each quarter of the year.
kable(
  geolocations.singledate_flights[
    , list(`Quarter Count` = .N)
    , keyby = list(Quarter = quarter(Date))
  ]
)
| Quarter | Quarter Count |
|---|---|
| 1 | 71944 |
| 2 | 49476 |
| 3 | 20690 |
| 4 | 28807 |
# Consecutive years spanning the earliest to the latest Year present in the
# single-date flights.
year.min <- min(geolocations.singledate_flights[["Year"]])
year.max <- max(geolocations.singledate_flights[["Year"]])
year.levels <- seq(year.min, year.max, by = 1)
# Stacked 100% bar chart of the quarter mix within each year. Quarter and Year
# are turned into factors with explicit levels and the scales keep unused
# levels (drop = FALSE), so every year in year.levels and all four quarters
# stay on the axis and in the legend.
plotly::layout(
  ggplotly(
    ggplot(
      geolocations.singledate_flights[
        , list(
          Quarter = factor(quarter(Date), levels = 1:4),
          Year = factor(Year, levels = year.levels),
          Date = Date
        )
      ],
      aes(x = Year, fill = Quarter)
    ) +
      geom_bar(position = 'fill') +
      scale_y_continuous(labels = scales::percent) +
      scale_fill_discrete(drop = FALSE) +
      scale_x_discrete(drop = FALSE) +
      theme_bw()
  ),
  xaxis = list(tickangle = 90),
  legend = list(
    orientation = "h",
    xanchor = "center",
    x = 0.575,
    y = -0.2
  )
)
# Number of flights (Frequency) observed for each per-flight row count
# (Image_Count).
geolocations.image_count <- geolocations.singledate_flights[
  , list(Image_Count = .N)
  , keyby = "FlightID"
][
  , list(Frequency = .N)
  , keyby = "Image_Count"
]
# Interactive bar chart of flight frequency by per-flight image count, with the
# x scale limited to 0-500.
ggplotly(
  ggplot(
    geolocations.image_count,
    aes(x = Image_Count, y = Frequency)
  ) +
    geom_bar(stat = 'identity') +
    xlab('Image Count') +
    xlim(0, 500) +
    theme_bw()
)
# Row count per Scale, most frequent scale first.
geolocations.scale_freq <- geolocations.singledate_flights[
  , list(Scale_Count = .N)
  , keyby = "Scale"
][order(-Scale_Count)]
# Interactive bar chart of the scales that occur more than 100 times. Scale is
# made a factor whose levels follow the row order of geolocations.scale_freq,
# which fixes the order of the bars on the x axis.
plotly::layout(
  ggplotly(
    ggplot(
      geolocations.scale_freq[
        Scale_Count > 100
        , list(
          Scale = factor(Scale, levels = geolocations.scale_freq$Scale),
          Scale_Count = Scale_Count
        )
      ],
      aes(x = Scale, y = Scale_Count)
    ) +
      geom_bar(stat = 'identity') +
      ylab('Scale Count') +
      theme_bw()
  ),
  xaxis = list(tickangle = 90)
)
# For every Year, report the Scale with the highest row count.
# Rows are visited in (Year, descending Scale_Count) order while freq_rank is
# assigned within each Year, so rank 1 marks the most frequent scale.
kable(
  geolocations.singledate_flights[
    , list(Scale_Count = .N)
    , keyby = c("Scale", "Year")
  ][
    order(Year, -Scale_Count)
    , freq_rank := seq_len(.N)
    , by = "Year"
  ][
    freq_rank == 1L
    , list(Year = Year, `Most Frequent Scale` = Scale)
  ][
    order(Year)
  ]
)
| Year | Most Frequent Scale |
|---|---|
| 1927 | 18000 |
| 1928 | 18000 |
| 1929 | 14400 |
| 1930 | 24000 |
| 1931 | 12000 |
| 1932 | 14400 |
| 1933 | 14400 |
| 1934 | 13500 |
| 1935 | 14400 |
| 1936 | 24000 |
| 1937 | 20000 |
| 1938 | 20000 |
| 1939 | 20000 |
| 1940 | 20000 |
| 1941 | 24000 |
| 1942 | 20000 |
| 1943 | 20000 |
| 1944 | 10000 |
| 1945 | 14400 |
| 1946 | 20000 |
| 1947 | 24000 |
| 1948 | 20000 |
| 1949 | 20000 |
| 1950 | 20000 |
| 1951 | 20000 |
| 1952 | 20000 |
| 1953 | 20000 |
| 1954 | 20000 |
| 1955 | 14400 |
| 1956 | 20000 |
| 1957 | 20000 |
| 1958 | 20000 |
| 1959 | 20000 |
| 1960 | 14400 |
| 1961 | 20000 |
| 1962 | 20000 |
| 1963 | 20000 |
| 1964 | 12000 |
| 1965 | 12000 |
| 1966 | 12000 |
| 1967 | 20000 |
| 1968 | 12000 |
| 1969 | 12000 |
| 1970 | 20000 |
| 1971 | 12000 |
| 1972 | 7200 |
| 1973 | 12000 |
| 1974 | 24000 |
| 1975 | 12000 |
| 1976 | 24000 |
| 1977 | 24000 |
| 1978 | 40000 |
| 1979 | 36000 |
| 1980 | 24000 |
| 1981 | 12000 |
| 1982 | 42000 |
| 1983 | 36000 |
| 1984 | 31680 |
| 1985 | 31680 |
| 1986 | 36000 |
| 1987 | 24000 |
| 1988 | 36000 |
| 1989 | 24000 |
| 1990 | 36000 |
| 1991 | 36000 |
| 1992 | 40000 |
| 1993 | 34600 |
| 1994 | 24000 |
| 1995 | 12000 |
| 1997 | 24000 |
| 1998 | 42000 |
| 1999 | 10800 |
| 2000 | 10800 |
| 2001 | 12000 |
| 2002 | 15000 |
| 2003 | 30000 |
| 2004 | 21000 |
| 2005 | 12000 |
| 2006 | 3600 |
| 2007 | 24000 |
| 2008 | 24000 |
| 2010 | 12000 |
# Select the (FlightID, Year, Scale) groups from 1952-1965 that have more than
# 500 rows at scale 20000. Capped_Image_Count is 1500 for groups with more than
# 1500 rows and 500 for all the others.
flights.selected <- geolocations.singledate_flights[
  Year %in% 1952:1965
  , list(Image_Count = .N)
  , keyby = c("FlightID", "Year", "Scale")
][
  Image_Count > 500 & Scale == 20000
  , list(
    FlightID = FlightID,
    Year = Year,
    Scale = Scale,
    Image_Count = Image_Count,
    Capped_Image_Count = ifelse(Image_Count <= 1500, 500, 1500)
  )
]
# Show the selected flights, largest image count first.
kable(flights.selected[order(-Image_Count)])
| FlightID | Year | Scale | Image_Count | Capped_Image_Count |
|---|---|---|---|---|
| AXL_1953B | 1952 | 20000 | 6445 | 1500 |
| CAS_FRE | 1965 | 20000 | 2509 | 1500 |
| AXN_1953 | 1953 | 20000 | 2497 | 1500 |
| ABL_1956 | 1956 | 20000 | 2493 | 1500 |
| AXJ_1952 | 1952 | 20000 | 2357 | 1500 |
| AXM_1953B | 1953 | 20000 | 2337 | 1500 |
| BTM_1954 | 1954 | 20000 | 1597 | 1500 |
| AXL_1959 | 1959 | 20000 | 1373 | 500 |
| ABK_1952 | 1952 | 20000 | 1369 | 500 |
| CAS_SD | 1963 | 20000 | 1252 | 500 |
| AXJ_1959 | 1959 | 20000 | 1138 | 500 |
| ABF_1957 | 1957 | 20000 | 1046 | 500 |
| PAI_ABC | 1952 | 20000 | 990 | 500 |
| CSH_1953 | 1952 | 20000 | 950 | 500 |
| AXI_1959 | 1959 | 20000 | 944 | 500 |
| CAS_STAN | 1963 | 20000 | 912 | 500 |
| CIV_1956 | 1956 | 20000 | 837 | 500 |
| ABD_1957 | 1957 | 20000 | 826 | 500 |
| CAS_SCL | 1963 | 20000 | 823 | 500 |
| BTM_1961 | 1961 | 20000 | 769 | 500 |
| ABB_1957 | 1957 | 20000 | 730 | 500 |
| CAS_SAC | 1961 | 20000 | 670 | 500 |
| ABO_1957 | 1957 | 20000 | 611 | 500 |
| AXC_1952 | 1952 | 20000 | 592 | 500 |
| ABE_1957 | 1957 | 20000 | 590 | 500 |
| ABL_1952 | 1952 | 20000 | 568 | 500 |
| AXK_1953 | 1952 | 20000 | 539 | 500 |
| BUT_1958 | 1958 | 20000 | 508 | 500 |
# Rows of the single-date flights whose FlightID appears in flights.selected.
# `nomatch = 0L` is a documented way to request an inner join (rows of
# flights.selected without a match are dropped); a logical value is not part
# of the documented interface.
geolocations.selected <- geolocations.singledate_flights[
  flights.selected
  , on = .(FlightID)
  , nomatch = 0L
]
# Build the spatial object for the selected rows via the project helper
# (defined outside this section).
flightSpatialPolygonDataFrame <- createFlightSpatialPolygonDataFrame(geolocations.selected)
# Colour mapping over the selected FlightIDs: the 11-colour Spectral palette is
# interpolated to one colour per row of flights.selected.
flightPalette <- leaflet::colorFactor(
  palette = colorRampPalette(
    RColorBrewer::brewer.pal(11, 'Spectral')
  )(nrow(flights.selected)),
  domain = factor(flights.selected[["FlightID"]])
)
# Draw the flight polygons on a label-free CartoDB basemap, coloured through
# flightPalette by the `flight` field and labelled with `flightYear`.
leaflet(
  data = flightSpatialPolygonDataFrame,
  options = leafletOptions(preferCanvas = TRUE)
) %>%
  addProviderTiles(
    provider = providers$CartoDB.PositronNoLabels,
    group = 'CartoDB.NoLabels',
    options = providerTileOptions(
      updateWhenZooming = FALSE,
      updateWhenIdle = TRUE
    )
  ) %>%
  addPolygons(
    color = ~flightPalette(flight),
    label = ~flightYear
  )